# -*- coding: utf-8 -*-
"""
Created on Mon Jan 3 16:28:27 2022

@author: 玦祎
"""
import re
import pandas as pd
import openpyxl
import time
import requests

xlsx = '半导体行业.xlsx'
df = pd.read_excel(xlsx)
exf = openpyxl.load_workbook(xlsx)  # workbook
sheet = exf.active                  # work on the active sheet only
C2 = sheet['C2']
C = sheet['C']
links = [c.value for c in C]
links_1 = links[1:-1]               # drop the header row and the trailing cell
links_2 = ''.join(links_1)
p = re.compile('"(.*?)","(.*?)"')   # each cell holds a link and a file name as two quoted strings
list_of_tuple = p.findall(links_2)
df2 = pd.DataFrame({'link': [t[0] for t in list_of_tuple],
                    'f_name': [t[1] for t in list_of_tuple]})
df2.to_csv('半导体行业.csv')

import re
import pandas as pd
import openpyxl
import os

f = open('半导体行业.csv', encoding='utf-8')
df = pd.read_csv(f)

def filter_links(words, df, include=True):
    # Keep (include=True) or drop (include=False) rows whose f_name contains any of the given words.
    ls = []
    for word in words:
        if include:
            ls.append([word in f for f in df.f_name])
        else:
            ls.append([word not in f for f in df.f_name])
    index = []
    for r in range(len(df)):
        flag = not include
        for c in range(len(words)):
            if include:
                flag = flag or ls[c][r]
            else:
                flag = flag and ls[c][r]
        index.append(flag)
    df2 = df[index]
    return df2

df_all = filter_links(['摘要', '问询函', '社会责任', '审计', '财务', '风险', '债券'], df, include=False)
df_orig = filter_links(['(', '('], df_all, include=False)
df_updt = filter_links(['(', '('], df_all, include=True)
df_updt = filter_links(['取消'], df_updt, include=False)

def sub_with_update(df_updt, df_orig):
    # Replace the link of an original report with the link of its updated version when one exists.
    df_newest = df_orig.copy()
    index_orig = []
    index_updt = []
    for i, f in enumerate(df_orig.f_name):
        for j, fn in enumerate(df_updt.f_name):
            if f in fn:
                index_orig.append(i)
                index_updt.append(j)
    for n in range(len(index_orig)):
        i = index_orig[n]
        j = index_updt[n]
        df_newest.iloc[i, -2] = df_updt.iloc[j, -2]
    return df_newest

df_newest = sub_with_update(df_updt, df_orig)
df_newest.sort_values(by=['f_name'], inplace=True, ignore_index=True)
df_newest['公司简称'] = [f[:4] for f in df_newest.f_name]
counts = df_newest['公司简称'].value_counts()
ten_company = []
for cn in counts.index[:10]:
    ten_company.append(filter_links([cn], df_newest))

# Save each company's annual-report links to its own CSV file.
if not os.path.exists('10companies'):
    os.makedirs('10companies')
for df_com in ten_company:
    cn = df_com['公司简称'].iloc[0]
    df_com.to_csv('10companies/%s.csv' % cn)
ten_csv = os.listdir('10companies')
# Two of the selected companies produce CSVs whose report links cannot be retrieved by this code;
# those CSV files were removed manually.

# Extract the links from the saved CSV files and download each company's annual-report PDFs for every year.
import re
import os
import requests
import pandas as pd
import time

# Loop over the ten CSV files in the folder and collect the PDF download links from each.
for info in os.listdir('10companies'):
    domain = os.path.abspath(r'10companies')  # absolute path of the folder
    info = os.path.join(domain, info)         # join path and file name to get each file's full path
    df = pd.read_csv(info)
    links = df["link"]
    f_names = df["f_name"]

    def get_PDF_url(url):
        r = requests.get(url)
        r.encoding = 'utf-8'
        html = r.text
        r.close()  # html retrieved, close the connection
        # The anchor-matching pattern was garbled in the original source; the pattern below is a
        # reconstruction that captures the href and the link text of the first <a> tag on the page.
        p = re.compile('<a href="(.*?)".*?>(.*?)</a>', re.DOTALL)
        a = p.search(html)  # the first match is the target tag, so search() is enough
        if a is None:
            print('No download link found; please check the page manually: %s' % url)
            return (None, None)
        else:
            href = a.group(1)
            fname = a.group(2).strip()
            href = r.url[:26] + href  # build the full download link
            return (href, fname)

    hrefs = []
    fnames = []
    for link in links:
        href, fname = get_PDF_url(link)
        hrefs.append(href)
        fnames.append(fname)
        time.sleep(0)
    df_final_links = pd.DataFrame({'href': hrefs, 'f_name': fnames})
    ste = info[-8:-4]  # company name taken from the CSV file name
    df_final_links.to_csv("final_links_" + ste + ".csv")

import os
import requests
import pandas as pd
import time

for info in os.listdir('10companies'):  # process each company's CSV file in turn
    domain = os.path.abspath(r'10companies')  # absolute path of the folder
    info = os.path.join(domain, info)         # full path of each file
    df = pd.read_csv(info)
    ste = info[-8:-4]
    df_final_links = pd.read_csv("final_links_" + ste + ".csv")
    hrefs = df_final_links["href"]
    f_names = df_final_links["f_name"]
    for i in range(len(hrefs)):  # download every link listed in the generated CSV
        href = hrefs[i]
        f_name = f_names[i]
        r = requests.get(href, allow_redirects=True)
        open('%s' % f_name, 'wb').write(r.content)
        time.sleep(0)
        r.close()

import os
import re
import fitz  # pip install pymupdf
import csv
import pandas as pd
import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = ['SimHei']  # make Chinese labels display correctly
plt.rcParams['axes.unicode_minus'] = False    # make minus signs display correctly

filenames = os.listdir()
prefix = '年度报告'
pdf = [f for f in filenames if f.endswith('.pdf')]
year = [f[-13:-4] for f in pdf]
gs = [f[:4] for f in pdf]

def getText(pdf):
    # Concatenate the text of every page in the PDF.
    text = ''
    doc = fitz.open(pdf)
    for page in doc:
        text += page.getText()  # get_text() in newer PyMuPDF versions
    doc.close()
    return text

text = [getText(f) for f in pdf]
# text[0]  # inspect the first report's text if needed

def get_content(pdf):
    # Extract the section between 第二节 (company profile and key financial indicators) and 第三节.
    text = getText(pdf)
    p = re.compile(r'第二节\s*公司简介和主要财务指标(.*)第三节\s*公司业务概要', re.DOTALL)
    content = p.search(text).group(0)
    return content

content = [get_content(f) for f in pdf]
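
# A minimal sketch of how the extracted sections could be used with the matplotlib setup above.
# It assumes each element of `content` contains a line such as "营业收入(元) 1,234,567.89" in the
# 主要会计数据 table; the regular expression and the get_revenue helper are illustrative
# assumptions, not something every report is guaranteed to match.
def get_revenue(section):
    m = re.search(r'营业收入[((]元[))]\s*([0-9][0-9,，]*(?:\.[0-9]+)?)', section)
    if m is None:
        return None  # the table layout differs from the assumed pattern
    return float(m.group(1).replace(',', '').replace('，', ''))

revenue = [get_revenue(c) for c in content]
df_rev = pd.DataFrame({'公司': gs, '年份': year, '营业收入': revenue})
for name, grp in df_rev.dropna(subset=['营业收入']).groupby('公司'):
    grp = grp.sort_values('年份')
    plt.plot(grp['年份'], grp['营业收入'], marker='o', label=name)
plt.legend()
plt.xlabel('年份')
plt.ylabel('营业收入(元)')
plt.show()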